If you don’t have them installed already, you need to download and install R and RStudio.
To install a package, use install.packages("Pkg Name"), e.g., install.packages("tidyverse")
To load an installed package, use the library function, e.g., library(tidyverse)
# Anything following a # on a line is a comment and will be ignored by R
?function_name # opens the help page for a function
??term # searches the help system for a term
library(tidyverse)
# mpg is a ggplot dataset of fuel economy data from 1999 and 2008 for 38 popular models of car
mpg## # A tibble: 234 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
## 3 audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
## 4 audi a4 2.0 2008 4 auto(av) f 21 30 p compact
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p compact
## 6 audi a4 2.8 1999 6 manual(m5) f 18 26 p compact
## 7 audi a4 3.1 2008 6 auto(av) f 18 27 p compact
## 8 audi a4 quattro 1.8 1999 4 manual(m5) 4 18 26 p compact
## 9 audi a4 quattro 1.8 1999 4 auto(l5) 4 16 25 p compact
## 10 audi a4 quattro 2.0 2008 4 manual(m6) 4 20 28 p compact
## # ... with 224 more rows
summary(mpg)## manufacturer model displ year cyl trans
## Length:234 Length:234 Min. :1.600 Min. :1999 Min. :4.000 Length:234
## Class :character Class :character 1st Qu.:2.400 1st Qu.:1999 1st Qu.:4.000 Class :character
## Mode :character Mode :character Median :3.300 Median :2004 Median :6.000 Mode :character
## Mean :3.472 Mean :2004 Mean :5.889
## 3rd Qu.:4.600 3rd Qu.:2008 3rd Qu.:8.000
## Max. :7.000 Max. :2008 Max. :8.000
## drv cty hwy fl class
## Length:234 Min. : 9.00 Min. :12.00 Length:234 Length:234
## Class :character 1st Qu.:14.00 1st Qu.:18.00 Class :character Class :character
## Mode :character Median :17.00 Median :24.00 Mode :character Mode :character
## Mean :16.86 Mean :23.44
## 3rd Qu.:19.00 3rd Qu.:27.00
## Max. :35.00 Max. :44.00
mpg$drv## [1] "f" "f" "f" "f" "f" "f" "f" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "r" "r" "r" "r" "r" "r" "r" "r" "r" "r"
## [29] "4" "4" "4" "4" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "4" "4" "4" "4" "4" "4" "4" "4"
## [57] "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "r" "r" "r" "4" "4" "4" "4" "4" "4" "4"
## [85] "4" "4" "4" "4" "4" "4" "r" "r" "r" "r" "r" "r" "r" "r" "r" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f"
## [113] "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "r" "r" "r" "4" "4" "4"
## [141] "4" "f" "f" "f" "f" "f" "f" "f" "f" "f" "4" "4" "4" "4" "f" "f" "f" "f" "f" "4" "4" "4" "4" "4" "4" "4" "4" "4"
## [169] "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "4" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f"
## [197] "f" "f" "4" "4" "4" "4" "4" "4" "4" "4" "4" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f" "f"
## [225] "f" "f" "f" "f" "f" "f" "f" "f" "f" "f"
The ggplot command begins a plot to which you can add layers
ggplot(data = mpg) + geom_point(mapping = aes(x = displ, y = hwy))ggplot(data = mpg) + geom_smooth(mapping = aes(x = displ, y = hwy)) + geom_point(mapping = aes(x = displ, y = hwy))By inserting x and y into ggplot, all layers will use those parameters unless otherwise specified
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
geom_smooth() +
geom_point()ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
geom_smooth() +
geom_point() +
geom_smooth(mapping = aes(x = displ,y = cty),color = "red")
Note: if you want to map a variable to a feature like color or size, it must go inside the aes() term; if you just want to set it to a fixed value, it goes outside the aes().
ggplot(mpg, aes(displ,hwy)) +
geom_smooth(color = "green") +
geom_point(aes(color=class),size=2)
You can also split the plot into subplots based on a variable using facets.
ggplot(mpg, aes(displ, hwy)) + geom_point() + facet_grid(. ~ cyl)ggplot(mpg, aes(displ, hwy)) + geom_point() + facet_grid(cyl ~ .)ggplot(mpg, aes(displ, hwy)) + geom_point() + facet_wrap(~cyl)ggplot(mpg, aes(displ, hwy)) + geom_point() + facet_grid(cyl ~ class)Try to make some of the following plots:
This time we will be using the diamonds dataset
summary(diamonds)## carat cut color clarity depth table price
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065 Min. :43.00 Min. :43.00 Min. : 326
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194 Median :61.80 Median :57.00 Median : 2401
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171 Mean :61.75 Mean :57.46 Mean : 3933
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324
## Max. :5.0100 I: 5422 VVS1 : 3655 Max. :79.00 Max. :95.00 Max. :18823
## J: 2808 (Other): 2531
## x y z
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.710 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.700 Median : 5.710 Median : 3.530
## Mean : 5.731 Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :10.740 Max. :58.900 Max. :31.800
##
ggplot(diamonds, aes(cut)) + geom_bar()ggplot(diamonds, aes(price)) + geom_histogram(bins = 100)ggplot(diamonds) + geom_bar(aes(x = cut, y = ..prop.., group = 1))ggplot(diamonds) + geom_bar(aes(x = color, fill = cut), position = "dodge")ggplot(diamonds) + geom_bar(aes(x = color, fill = cut), position = "fill")ggplot(diamonds) + geom_bar(aes(x = color, color = cut), position = "stack", fill = NA)plots from http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html
ggplot(midwest, aes(x=area, y=poptotal)) +
geom_point(aes(col=state, size=popdensity)) +
geom_smooth(method="loess", se=F) + xlim(c(0, 0.1)) + ylim(c(0, 500000)) +
labs(subtitle="Area Vs Population", y="Population", x="Area", title="Scatterplot", caption = "Source: midwest")mtcars$`car name` <- rownames(mtcars) # create new column for car names
# Diverging-bar prep. NOTE: relies on the `car name` column created just
# above from rownames(mtcars). Statement order matters: rows must be sorted
# BEFORE the factor levels are frozen.
# Normalised mileage: z-score of mpg, rounded to 2 decimal places.
mtcars$mpg_z <- round((mtcars$mpg - mean(mtcars$mpg))/sd(mtcars$mpg), 2) # compute normalized mpg
# Flag each car as above or below average mileage (sign of the z-score).
mtcars$mpg_type <- ifelse(mtcars$mpg_z < 0, "below", "above") # above / below avg flag
# Order rows from lowest to highest z-score.
mtcars <- mtcars[order(mtcars$mpg_z), ] # sort
# Freeze the sorted order: ggplot draws a discrete axis in factor-level
# order, so the levels must be set while the rows are sorted.
mtcars$`car name` <- factor(mtcars$`car name`, levels = mtcars$`car name`) # convert to factor to retain sorted order in plot.
ggplot(mtcars, aes(x=`car name`, y=mpg_z, label=mpg_z)) +
geom_bar(stat='identity', aes(fill=mpg_type), width=.5) +
scale_fill_manual(name="Mileage",
labels = c("Above Average", "Below Average"),
values = c("above"="#00ba38", "below"="#f8766d")) +
labs(subtitle="Normalised mileage from 'mtcars'", title= "Diverging Bars") +
coord_flip()# prep data
# Slope-chart prep: per-continent GDP-per-capita for two years, fetched
# from a public CSV. Column names are set to the two year labels.
df <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/gdppercap.csv")
names(df) <- c("continent", "1952", "1957")

# Text labels shown at each end of the slopes, e.g. "Asia, 1234".
left_label  <- paste(df$continent, round(df$`1952`), sep = ", ")
right_label <- paste(df$continent, round(df$`1957`), sep = ", ")

# Line colour class: "red" where the value fell between the two years,
# "green" where it rose (or stayed the same).
df$class <- ifelse(df$`1957` < df$`1952`, "red", "green")
ggplot(df) + geom_segment(aes(x=1, xend=2, y=`1952`, yend=`1957`, col=class), size=.75, show.legend=F) +
geom_vline(xintercept=1, linetype="dashed", size=.1) +
geom_vline(xintercept=2, linetype="dashed", size=.1) +
scale_color_manual(labels = c("Up", "Down"),
values = c("green"="#00ba38", "red"="#f8766d")) + # color of lines
labs(x="", y="Mean GdpPerCap") + # Axis labels
xlim(.5, 2.5) + ylim(0,(1.1*(max(df$`1952`, df$`1957`)))) + # X and Y axis limits
geom_text(label=left_label, y=df$`1952`, x=rep(1, NROW(df)), hjust=1.1, size=3.5) +
geom_text(label=right_label, y=df$`1957`, x=rep(2, NROW(df)), hjust=-0.1, size=3.5) +
geom_text(label="Time 1", x=1, y=1.1*(max(df$`1952`, df$`1957`)), hjust=1.2, size=5) + # title
geom_text(label="Time 2", x=2, y=1.1*(max(df$`1952`, df$`1957`)), hjust=-0.1, size=5) + # title
theme(panel.background = element_blank(),panel.grid = element_blank(),axis.ticks = element_blank(),
axis.text.x = element_blank(),panel.border = element_blank(),plot.margin = unit(c(1,2,1,2), "cm"))
ggplot(mpg, aes(cty)) +
geom_density(aes(fill=factor(cyl)), alpha=0.8) +
labs(title="Density plot",
subtitle="City Mileage Grouped by Number of cylinders",
caption="Source: mpg",
x="City Mileage",
fill="# Cylinders")ggplot(mpg, aes(class, cty)) +
geom_violin() +
labs(title="Violin plot",
subtitle="City Mileage vs Class of vehicle",
caption="Source: mpg",
x="Class of Vehicle",
y="City Mileage")theme_set(theme_classic())
df <- as.data.frame(table(mpg$class))
colnames(df) <- c("class", "freq")
ggplot(df, aes(x = "", y=freq, fill = factor(class))) +
geom_bar(width = 1, stat = "identity") +
theme(axis.line = element_blank(),
plot.title = element_text(hjust=0.5)) +
labs(fill="class",
x=NULL,
y=NULL,
title="Pie Chart of class",
caption="Source: mpg") +
coord_polar(theta = "y", start=0)EncSz <- 25
# --- Grid/encoder demo data; EncSz (grid side length) is defined just above.
# NOTE(review): the SP/Enc/SynPerm names suggest an HTM spatial-pooler and
# encoder demo — confirm with the author.
# Permanence threshold for treating a sampled cell as "connected".
SynPermCon <- 0.5
# Fraction of the EncSz^2 grid cells that receive a sampled value.
PtPrcnt <- 0.75
# Number of cells to sample.
SPSmpSz <- round(EncSz^2*PtPrcnt)
# Encoder bitmap: background value .3 everywhere ...
ENC <- rep(.3,EncSz^2)
# ... with four hand-picked index ranges switched on (value 1).
ENC[c(19:83,200:250,353:420,497:585)] <- 1
# (x, y) coordinates for every cell of the EncSz x EncSz tile plot.
SPEncBoxes <- tibble(x = rep(c(1:EncSz),EncSz), y = sort(rep(c(1:EncSz),EncSz)))
# Permanences: NA everywhere except SPSmpSz randomly chosen cells, which
# get normal draws centred just below the connection threshold.
j <- rep(NA,EncSz^2)
j[sample(EncSz^2,SPSmpSz)] <- rnorm(SPSmpSz,mean=.9*SynPermCon,sd=SynPermCon/5)
# j2 classifies sampled cells: 1 = permanence above 0.5, 2 = above 0.5 AND
# sitting on an active encoder bit; unsampled cells stay NA.
j2 <- rep(NA,EncSz^2)
j2[j>0.5] <- 1
j2[j>0.5 & ENC ==1] <- 2
j2[is.na(j)] <- NA
# Activity vector used for tile alpha/colour: 1 where connected, else 0.1.
EncAct <- rep(0.1,EncSz^2)
EncAct[j>SynPermCon] <- 1
# NOTE: j is reassigned here — binned into permanence bands for the fill
# scale of the second plot below; the raw numeric values are discarded.
j <- cut(j,breaks = c(-Inf,seq(0.4,0.6,0.025),Inf))
# Blank ggplot theme: removes axes, ticks, titles, legend, grid,
# backgrounds, and margins so only the plotted geoms remain.
# (Fixed: use `<-` for assignment rather than `=`.)
BlnkGrph <- theme(
  axis.line        = element_blank(),
  axis.text.x      = element_blank(),
  axis.text.y      = element_blank(),
  axis.ticks       = element_blank(),
  axis.title.x     = element_blank(),
  axis.title.y     = element_blank(),
  legend.position  = "none",
  panel.background = element_blank(),
  panel.border     = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  plot.background  = element_blank(),
  plot.margin      = grid::unit(c(0, 0, 0, 0), "mm")
)
SPEncBoxes %>% ggplot(aes(x,y,fill = factor(round(ENC)))) +
geom_tile(color = "gray",show.legend=FALSE) + BlnkGrph + coord_fixed() +
geom_point(aes(x,y, color = factor(j2)),shape = 16,na.rm=TRUE, size = 3) +
scale_fill_manual(values = c("white","blue")) + scale_shape_identity() +
scale_color_manual(values = c("black","green"))SPEncBoxes %>% ggplot(aes(x,y,fill = j, color = EncAct)) +
geom_tile(show.legend=FALSE, size = 0.2,alpha=EncAct) + BlnkGrph + coord_fixed() +
scale_color_gradient(low="gray",high ="black") +
scale_fill_manual(values = c("red","red","red","red","orangered", "orange","yellow","lightgreen","green1","green1"),na.value="white")-sqrt(25) + (5 + 3)/4 * 7 - 2^2## [1] 5
5%/%3 # Integer Division## [1] 1
5%%3 # Modulo (remainder after division)## [1] 2
5 == 6## [1] FALSE
5 != 6## [1] TRUE
83 > (25 >= 23)## [1] TRUE
5 > 3 & 3 < 2## [1] FALSE
5 > 3 | 3 < 2## [1] TRUE
1:4## [1] 1 2 3 4
c(5, 3, 2, 1) # Creates a vector via concatenation (hence the c)## [1] 5 3 2 1
c(12, 1:4, 6)## [1] 12 1 2 3 4 6
# Creates a vector with the given parameters.
# (Fixed: `t = 10` relied on partial argument matching — spell out `to`.)
seq(from = 1, to = 10, by = 2)
## [1] 1 3 5 7 9
seq(1, 10, 2) # creates the same vector without naming the paramters## [1] 1 3 5 7 9
seq(1, 10) # R uses the default values for any empty parameters## [1] 1 2 3 4 5 6 7 8 9 10
seq(to = 10, by = 2)## [1] 1 3 5 7 9
seq(by = 2, to = 10)## [1] 1 3 5 7 9
c(seq(1, 10, 2), 25, 10)## [1] 1 3 5 7 9 25 10
c(seq(1, 10, 2), 25, 10) > 12## [1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE
c(seq(1, 10, 2), 25, 10) * 2## [1] 2 6 10 14 18 50 20
x = 5 + 3
(x = 5 + 3)## [1] 8
x <- 5 + 3
(x <- 5 + 3)## [1] 8
<- is the assignment operator
y <- x
y## [1] 8
x <- 5 + 3 > 2
x## [1] TRUE
x <- seq(172, 23, -13)
x## [1] 172 159 146 133 120 107 94 81 68 55 42 29
To call a function such as seq, use parentheses ()
x <- seq(172, 23, -13)
x[1]## [1] 172
x[c(1, 3)]## [1] 172 146
x[2:4]## [1] 159 146 133
x[4:2]## [1] 133 146 159
x[]## [1] 172 159 146 133 120 107 94 81 68 55 42 29
x[-1]## [1] 159 146 133 120 107 94 81 68 55 42 29
x[-c(1, 3)]## [1] 159 133 120 107 94 81 68 55 42 29
x[x%%2 == 0]## [1] 172 146 120 94 68 42
y <- x[x%%2 == 0]
y[9] <- 10
y## [1] 172 146 120 94 68 42 NA NA 10
x <- 1:20
mean(x)## [1] 10.5
max(x)## [1] 20
min(x)## [1] 1
length(x)## [1] 20
range(x)## [1] 1 20
prod(x)## [1] 2.432902e+18
var(x)## [1] 35
log(x)## [1] 0.0000000 0.6931472 1.0986123 1.3862944 1.6094379 1.7917595 1.9459101 2.0794415 2.1972246 2.3025851 2.3978953
## [12] 2.4849066 2.5649494 2.6390573 2.7080502 2.7725887 2.8332133 2.8903718 2.9444390 2.9957323
sqrt(x)## [1] 1.000000 1.414214 1.732051 2.000000 2.236068 2.449490 2.645751 2.828427 3.000000 3.162278 3.316625 3.464102
## [13] 3.605551 3.741657 3.872983 4.000000 4.123106 4.242641 4.358899 4.472136
Note that many functions in R have default values for some of their parameters and you should always try to be aware of them even if you don’t want to change them
rnorm(10)## [1] -0.92551880 0.02149644 -0.79360933 -1.15555822 -1.20551451 -0.07993958 0.88575220 2.55462667 -0.03038346
## [10] 0.23839532
rnorm randomly generates values from a normal distribution, but a normal distribution requires a mean and a standard deviation. If you type ?rnorm you will see the full documentation but for our purposes the important part is
rnorm(n, mean = 0, sd = 1)
By default, the rnorm function assumes a mean of 0 and a standard deviation of 1. You can change those values easily, but only if you are aware of them.
When you have data consisting of multiple observations of multiple variables, i.e., a data set, this is most conveniently stored as a dataframe
iris # Famous iris data set which gives the measurements for 50 flowers from each of 3 species of iris## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
## 51 7.0 3.2 4.7 1.4 versicolor
## 52 6.4 3.2 4.5 1.5 versicolor
## 53 6.9 3.1 4.9 1.5 versicolor
## 54 5.5 2.3 4.0 1.3 versicolor
## 55 6.5 2.8 4.6 1.5 versicolor
## 56 5.7 2.8 4.5 1.3 versicolor
## 57 6.3 3.3 4.7 1.6 versicolor
## 58 4.9 2.4 3.3 1.0 versicolor
## 59 6.6 2.9 4.6 1.3 versicolor
## 60 5.2 2.7 3.9 1.4 versicolor
## 61 5.0 2.0 3.5 1.0 versicolor
## 62 5.9 3.0 4.2 1.5 versicolor
## 63 6.0 2.2 4.0 1.0 versicolor
## 64 6.1 2.9 4.7 1.4 versicolor
## 65 5.6 2.9 3.6 1.3 versicolor
## 66 6.7 3.1 4.4 1.4 versicolor
## 67 5.6 3.0 4.5 1.5 versicolor
## 68 5.8 2.7 4.1 1.0 versicolor
## 69 6.2 2.2 4.5 1.5 versicolor
## 70 5.6 2.5 3.9 1.1 versicolor
## 71 5.9 3.2 4.8 1.8 versicolor
## 72 6.1 2.8 4.0 1.3 versicolor
## 73 6.3 2.5 4.9 1.5 versicolor
## 74 6.1 2.8 4.7 1.2 versicolor
## 75 6.4 2.9 4.3 1.3 versicolor
## 76 6.6 3.0 4.4 1.4 versicolor
## 77 6.8 2.8 4.8 1.4 versicolor
## 78 6.7 3.0 5.0 1.7 versicolor
## 79 6.0 2.9 4.5 1.5 versicolor
## 80 5.7 2.6 3.5 1.0 versicolor
## 81 5.5 2.4 3.8 1.1 versicolor
## 82 5.5 2.4 3.7 1.0 versicolor
## 83 5.8 2.7 3.9 1.2 versicolor
## 84 6.0 2.7 5.1 1.6 versicolor
## 85 5.4 3.0 4.5 1.5 versicolor
## 86 6.0 3.4 4.5 1.6 versicolor
## 87 6.7 3.1 4.7 1.5 versicolor
## 88 6.3 2.3 4.4 1.3 versicolor
## 89 5.6 3.0 4.1 1.3 versicolor
## 90 5.5 2.5 4.0 1.3 versicolor
## 91 5.5 2.6 4.4 1.2 versicolor
## 92 6.1 3.0 4.6 1.4 versicolor
## 93 5.8 2.6 4.0 1.2 versicolor
## 94 5.0 2.3 3.3 1.0 versicolor
## 95 5.6 2.7 4.2 1.3 versicolor
## 96 5.7 3.0 4.2 1.2 versicolor
## 97 5.7 2.9 4.2 1.3 versicolor
## 98 6.2 2.9 4.3 1.3 versicolor
## 99 5.1 2.5 3.0 1.1 versicolor
## 100 5.7 2.8 4.1 1.3 versicolor
## 101 6.3 3.3 6.0 2.5 virginica
## 102 5.8 2.7 5.1 1.9 virginica
## 103 7.1 3.0 5.9 2.1 virginica
## 104 6.3 2.9 5.6 1.8 virginica
## 105 6.5 3.0 5.8 2.2 virginica
## 106 7.6 3.0 6.6 2.1 virginica
## 107 4.9 2.5 4.5 1.7 virginica
## 108 7.3 2.9 6.3 1.8 virginica
## 109 6.7 2.5 5.8 1.8 virginica
## 110 7.2 3.6 6.1 2.5 virginica
## 111 6.5 3.2 5.1 2.0 virginica
## 112 6.4 2.7 5.3 1.9 virginica
## 113 6.8 3.0 5.5 2.1 virginica
## 114 5.7 2.5 5.0 2.0 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 116 6.4 3.2 5.3 2.3 virginica
## 117 6.5 3.0 5.5 1.8 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 120 6.0 2.2 5.0 1.5 virginica
## 121 6.9 3.2 5.7 2.3 virginica
## 122 5.6 2.8 4.9 2.0 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 124 6.3 2.7 4.9 1.8 virginica
## 125 6.7 3.3 5.7 2.1 virginica
## 126 7.2 3.2 6.0 1.8 virginica
## 127 6.2 2.8 4.8 1.8 virginica
## 128 6.1 3.0 4.9 1.8 virginica
## 129 6.4 2.8 5.6 2.1 virginica
## 130 7.2 3.0 5.8 1.6 virginica
## 131 7.4 2.8 6.1 1.9 virginica
## 132 7.9 3.8 6.4 2.0 virginica
## 133 6.4 2.8 5.6 2.2 virginica
## 134 6.3 2.8 5.1 1.5 virginica
## 135 6.1 2.6 5.6 1.4 virginica
## 136 7.7 3.0 6.1 2.3 virginica
## 137 6.3 3.4 5.6 2.4 virginica
## 138 6.4 3.1 5.5 1.8 virginica
## 139 6.0 3.0 4.8 1.8 virginica
## 140 6.9 3.1 5.4 2.1 virginica
## 141 6.7 3.1 5.6 2.4 virginica
## 142 6.9 3.1 5.1 2.3 virginica
## 143 5.8 2.7 5.1 1.9 virginica
## 144 6.8 3.2 5.9 2.3 virginica
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
summary(iris) # Very useful function, which gives summaries of each variable## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:50
## Median :5.800 Median :3.000 Median :4.350 Median :1.300 virginica :50
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
dataset$variablenamedataset['variablename']dataset[variable column position]names(iris)## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
iris$Sepal.Length## [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2
## [29] 5.2 4.7 4.8 5.4 5.2 5.5 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0 6.4 6.9 5.5 6.5 5.7
## [57] 6.3 4.9 6.6 5.2 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0
## [85] 5.4 6.0 6.7 6.3 5.6 5.5 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4
## [113] 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.0 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.0 6.9
## [141] 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9
iris["Sepal.Length"]## Sepal.Length
## 1 5.1
## 2 4.9
## 3 4.7
## 4 4.6
## 5 5.0
## 6 5.4
## 7 4.6
## 8 5.0
## 9 4.4
## 10 4.9
## 11 5.4
## 12 4.8
## 13 4.8
## 14 4.3
## 15 5.8
## 16 5.7
## 17 5.4
## 18 5.1
## 19 5.7
## 20 5.1
## 21 5.4
## 22 5.1
## 23 4.6
## 24 5.1
## 25 4.8
## 26 5.0
## 27 5.0
## 28 5.2
## 29 5.2
## 30 4.7
## 31 4.8
## 32 5.4
## 33 5.2
## 34 5.5
## 35 4.9
## 36 5.0
## 37 5.5
## 38 4.9
## 39 4.4
## 40 5.1
## 41 5.0
## 42 4.5
## 43 4.4
## 44 5.0
## 45 5.1
## 46 4.8
## 47 5.1
## 48 4.6
## 49 5.3
## 50 5.0
## 51 7.0
## 52 6.4
## 53 6.9
## 54 5.5
## 55 6.5
## 56 5.7
## 57 6.3
## 58 4.9
## 59 6.6
## 60 5.2
## 61 5.0
## 62 5.9
## 63 6.0
## 64 6.1
## 65 5.6
## 66 6.7
## 67 5.6
## 68 5.8
## 69 6.2
## 70 5.6
## 71 5.9
## 72 6.1
## 73 6.3
## 74 6.1
## 75 6.4
## 76 6.6
## 77 6.8
## 78 6.7
## 79 6.0
## 80 5.7
## 81 5.5
## 82 5.5
## 83 5.8
## 84 6.0
## 85 5.4
## 86 6.0
## 87 6.7
## 88 6.3
## 89 5.6
## 90 5.5
## 91 5.5
## 92 6.1
## 93 5.8
## 94 5.0
## 95 5.6
## 96 5.7
## 97 5.7
## 98 6.2
## 99 5.1
## 100 5.7
## 101 6.3
## 102 5.8
## 103 7.1
## 104 6.3
## 105 6.5
## 106 7.6
## 107 4.9
## 108 7.3
## 109 6.7
## 110 7.2
## 111 6.5
## 112 6.4
## 113 6.8
## 114 5.7
## 115 5.8
## 116 6.4
## 117 6.5
## 118 7.7
## 119 7.7
## 120 6.0
## 121 6.9
## 122 5.6
## 123 7.7
## 124 6.3
## 125 6.7
## 126 7.2
## 127 6.2
## 128 6.1
## 129 6.4
## 130 7.2
## 131 7.4
## 132 7.9
## 133 6.4
## 134 6.3
## 135 6.1
## 136 7.7
## 137 6.3
## 138 6.4
## 139 6.0
## 140 6.9
## 141 6.7
## 142 6.9
## 143 5.8
## 144 6.8
## 145 6.7
## 146 6.7
## 147 6.3
## 148 6.5
## 149 6.2
## 150 5.9
iris[1]## Sepal.Length
## 1 5.1
## 2 4.9
## 3 4.7
## 4 4.6
## 5 5.0
## 6 5.4
## 7 4.6
## 8 5.0
## 9 4.4
## 10 4.9
## 11 5.4
## 12 4.8
## 13 4.8
## 14 4.3
## 15 5.8
## 16 5.7
## 17 5.4
## 18 5.1
## 19 5.7
## 20 5.1
## 21 5.4
## 22 5.1
## 23 4.6
## 24 5.1
## 25 4.8
## 26 5.0
## 27 5.0
## 28 5.2
## 29 5.2
## 30 4.7
## 31 4.8
## 32 5.4
## 33 5.2
## 34 5.5
## 35 4.9
## 36 5.0
## 37 5.5
## 38 4.9
## 39 4.4
## 40 5.1
## 41 5.0
## 42 4.5
## 43 4.4
## 44 5.0
## 45 5.1
## 46 4.8
## 47 5.1
## 48 4.6
## 49 5.3
## 50 5.0
## 51 7.0
## 52 6.4
## 53 6.9
## 54 5.5
## 55 6.5
## 56 5.7
## 57 6.3
## 58 4.9
## 59 6.6
## 60 5.2
## 61 5.0
## 62 5.9
## 63 6.0
## 64 6.1
## 65 5.6
## 66 6.7
## 67 5.6
## 68 5.8
## 69 6.2
## 70 5.6
## 71 5.9
## 72 6.1
## 73 6.3
## 74 6.1
## 75 6.4
## 76 6.6
## 77 6.8
## 78 6.7
## 79 6.0
## 80 5.7
## 81 5.5
## 82 5.5
## 83 5.8
## 84 6.0
## 85 5.4
## 86 6.0
## 87 6.7
## 88 6.3
## 89 5.6
## 90 5.5
## 91 5.5
## 92 6.1
## 93 5.8
## 94 5.0
## 95 5.6
## 96 5.7
## 97 5.7
## 98 6.2
## 99 5.1
## 100 5.7
## 101 6.3
## 102 5.8
## 103 7.1
## 104 6.3
## 105 6.5
## 106 7.6
## 107 4.9
## 108 7.3
## 109 6.7
## 110 7.2
## 111 6.5
## 112 6.4
## 113 6.8
## 114 5.7
## 115 5.8
## 116 6.4
## 117 6.5
## 118 7.7
## 119 7.7
## 120 6.0
## 121 6.9
## 122 5.6
## 123 7.7
## 124 6.3
## 125 6.7
## 126 7.2
## 127 6.2
## 128 6.1
## 129 6.4
## 130 7.2
## 131 7.4
## 132 7.9
## 133 6.4
## 134 6.3
## 135 6.1
## 136 7.7
## 137 6.3
## 138 6.4
## 139 6.0
## 140 6.9
## 141 6.7
## 142 6.9
## 143 5.8
## 144 6.8
## 145 6.7
## 146 6.7
## 147 6.3
## 148 6.5
## 149 6.2
## 150 5.9
$ identifier, then using brackets after the variable name to indicate specific position(s)iris$Sepal.Length[25:30]## [1] 4.8 5.0 5.0 5.2 5.2 4.7
iris[25:30, "Sepal.Length"]## [1] 4.8 5.0 5.0 5.2 5.2 4.7
iris[25:30, 1]## [1] 4.8 5.0 5.0 5.2 5.2 4.7
iris[c(25:30, 17, 1), c(1, 4)]## Sepal.Length Petal.Width
## 25 4.8 0.2
## 26 5.0 0.2
## 27 5.0 0.4
## 28 5.2 0.2
## 29 5.2 0.2
## 30 4.7 0.2
## 17 5.4 0.4
## 1 5.1 0.2
iris## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
## 51 7.0 3.2 4.7 1.4 versicolor
## 52 6.4 3.2 4.5 1.5 versicolor
## 53 6.9 3.1 4.9 1.5 versicolor
## 54 5.5 2.3 4.0 1.3 versicolor
## 55 6.5 2.8 4.6 1.5 versicolor
## 56 5.7 2.8 4.5 1.3 versicolor
## 57 6.3 3.3 4.7 1.6 versicolor
## 58 4.9 2.4 3.3 1.0 versicolor
## 59 6.6 2.9 4.6 1.3 versicolor
## 60 5.2 2.7 3.9 1.4 versicolor
## 61 5.0 2.0 3.5 1.0 versicolor
## 62 5.9 3.0 4.2 1.5 versicolor
## 63 6.0 2.2 4.0 1.0 versicolor
## 64 6.1 2.9 4.7 1.4 versicolor
## 65 5.6 2.9 3.6 1.3 versicolor
## 66 6.7 3.1 4.4 1.4 versicolor
## 67 5.6 3.0 4.5 1.5 versicolor
## 68 5.8 2.7 4.1 1.0 versicolor
## 69 6.2 2.2 4.5 1.5 versicolor
## 70 5.6 2.5 3.9 1.1 versicolor
## 71 5.9 3.2 4.8 1.8 versicolor
## 72 6.1 2.8 4.0 1.3 versicolor
## 73 6.3 2.5 4.9 1.5 versicolor
## 74 6.1 2.8 4.7 1.2 versicolor
## 75 6.4 2.9 4.3 1.3 versicolor
## 76 6.6 3.0 4.4 1.4 versicolor
## 77 6.8 2.8 4.8 1.4 versicolor
## 78 6.7 3.0 5.0 1.7 versicolor
## 79 6.0 2.9 4.5 1.5 versicolor
## 80 5.7 2.6 3.5 1.0 versicolor
## 81 5.5 2.4 3.8 1.1 versicolor
## 82 5.5 2.4 3.7 1.0 versicolor
## 83 5.8 2.7 3.9 1.2 versicolor
## 84 6.0 2.7 5.1 1.6 versicolor
## 85 5.4 3.0 4.5 1.5 versicolor
## 86 6.0 3.4 4.5 1.6 versicolor
## 87 6.7 3.1 4.7 1.5 versicolor
## 88 6.3 2.3 4.4 1.3 versicolor
## 89 5.6 3.0 4.1 1.3 versicolor
## 90 5.5 2.5 4.0 1.3 versicolor
## 91 5.5 2.6 4.4 1.2 versicolor
## 92 6.1 3.0 4.6 1.4 versicolor
## 93 5.8 2.6 4.0 1.2 versicolor
## 94 5.0 2.3 3.3 1.0 versicolor
## 95 5.6 2.7 4.2 1.3 versicolor
## 96 5.7 3.0 4.2 1.2 versicolor
## 97 5.7 2.9 4.2 1.3 versicolor
## 98 6.2 2.9 4.3 1.3 versicolor
## 99 5.1 2.5 3.0 1.1 versicolor
## 100 5.7 2.8 4.1 1.3 versicolor
## 101 6.3 3.3 6.0 2.5 virginica
## 102 5.8 2.7 5.1 1.9 virginica
## 103 7.1 3.0 5.9 2.1 virginica
## 104 6.3 2.9 5.6 1.8 virginica
## 105 6.5 3.0 5.8 2.2 virginica
## 106 7.6 3.0 6.6 2.1 virginica
## 107 4.9 2.5 4.5 1.7 virginica
## 108 7.3 2.9 6.3 1.8 virginica
## 109 6.7 2.5 5.8 1.8 virginica
## 110 7.2 3.6 6.1 2.5 virginica
## 111 6.5 3.2 5.1 2.0 virginica
## 112 6.4 2.7 5.3 1.9 virginica
## 113 6.8 3.0 5.5 2.1 virginica
## 114 5.7 2.5 5.0 2.0 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 116 6.4 3.2 5.3 2.3 virginica
## 117 6.5 3.0 5.5 1.8 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 120 6.0 2.2 5.0 1.5 virginica
## 121 6.9 3.2 5.7 2.3 virginica
## 122 5.6 2.8 4.9 2.0 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 124 6.3 2.7 4.9 1.8 virginica
## 125 6.7 3.3 5.7 2.1 virginica
## 126 7.2 3.2 6.0 1.8 virginica
## 127 6.2 2.8 4.8 1.8 virginica
## 128 6.1 3.0 4.9 1.8 virginica
## 129 6.4 2.8 5.6 2.1 virginica
## 130 7.2 3.0 5.8 1.6 virginica
## 131 7.4 2.8 6.1 1.9 virginica
## 132 7.9 3.8 6.4 2.0 virginica
## 133 6.4 2.8 5.6 2.2 virginica
## 134 6.3 2.8 5.1 1.5 virginica
## 135 6.1 2.6 5.6 1.4 virginica
## 136 7.7 3.0 6.1 2.3 virginica
## 137 6.3 3.4 5.6 2.4 virginica
## 138 6.4 3.1 5.5 1.8 virginica
## 139 6.0 3.0 4.8 1.8 virginica
## 140 6.9 3.1 5.4 2.1 virginica
## 141 6.7 3.1 5.6 2.4 virginica
## 142 6.9 3.1 5.1 2.3 virginica
## 143 5.8 2.7 5.1 1.9 virginica
## 144 6.8 3.2 5.9 2.3 virginica
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
as_tibble(iris) # shows only a few rows as well as the type of data in each row## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fctr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # ... with 140 more rows
iris$Spec## [1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
## [11] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
## [21] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
## [31] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
## [41] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
## [51] versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor
## [61] versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor
## [71] versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor
## [81] versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor
## [91] versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor
## [101] virginica virginica virginica virginica virginica virginica virginica virginica virginica virginica
## [111] virginica virginica virginica virginica virginica virginica virginica virginica virginica virginica
## [121] virginica virginica virginica virginica virginica virginica virginica virginica virginica virginica
## [131] virginica virginica virginica virginica virginica virginica virginica virginica virginica virginica
## [141] virginica virginica virginica virginica virginica virginica virginica virginica virginica virginica
## Levels: setosa versicolor virginica
as_tibble(iris)$Spec## Warning: Unknown or uninitialised column: 'Spec'.
## NULL
iris[1]## Sepal.Length
## 1 5.1
## 2 4.9
## 3 4.7
## 4 4.6
## 5 5.0
## 6 5.4
## 7 4.6
## 8 5.0
## 9 4.4
## 10 4.9
## 11 5.4
## 12 4.8
## 13 4.8
## 14 4.3
## 15 5.8
## 16 5.7
## 17 5.4
## 18 5.1
## 19 5.7
## 20 5.1
## 21 5.4
## 22 5.1
## 23 4.6
## 24 5.1
## 25 4.8
## 26 5.0
## 27 5.0
## 28 5.2
## 29 5.2
## 30 4.7
## 31 4.8
## 32 5.4
## 33 5.2
## 34 5.5
## 35 4.9
## 36 5.0
## 37 5.5
## 38 4.9
## 39 4.4
## 40 5.1
## 41 5.0
## 42 4.5
## 43 4.4
## 44 5.0
## 45 5.1
## 46 4.8
## 47 5.1
## 48 4.6
## 49 5.3
## 50 5.0
## 51 7.0
## 52 6.4
## 53 6.9
## 54 5.5
## 55 6.5
## 56 5.7
## 57 6.3
## 58 4.9
## 59 6.6
## 60 5.2
## 61 5.0
## 62 5.9
## 63 6.0
## 64 6.1
## 65 5.6
## 66 6.7
## 67 5.6
## 68 5.8
## 69 6.2
## 70 5.6
## 71 5.9
## 72 6.1
## 73 6.3
## 74 6.1
## 75 6.4
## 76 6.6
## 77 6.8
## 78 6.7
## 79 6.0
## 80 5.7
## 81 5.5
## 82 5.5
## 83 5.8
## 84 6.0
## 85 5.4
## 86 6.0
## 87 6.7
## 88 6.3
## 89 5.6
## 90 5.5
## 91 5.5
## 92 6.1
## 93 5.8
## 94 5.0
## 95 5.6
## 96 5.7
## 97 5.7
## 98 6.2
## 99 5.1
## 100 5.7
## 101 6.3
## 102 5.8
## 103 7.1
## 104 6.3
## 105 6.5
## 106 7.6
## 107 4.9
## 108 7.3
## 109 6.7
## 110 7.2
## 111 6.5
## 112 6.4
## 113 6.8
## 114 5.7
## 115 5.8
## 116 6.4
## 117 6.5
## 118 7.7
## 119 7.7
## 120 6.0
## 121 6.9
## 122 5.6
## 123 7.7
## 124 6.3
## 125 6.7
## 126 7.2
## 127 6.2
## 128 6.1
## 129 6.4
## 130 7.2
## 131 7.4
## 132 7.9
## 133 6.4
## 134 6.3
## 135 6.1
## 136 7.7
## 137 6.3
## 138 6.4
## 139 6.0
## 140 6.9
## 141 6.7
## 142 6.9
## 143 5.8
## 144 6.8
## 145 6.7
## 146 6.7
## 147 6.3
## 148 6.5
## 149 6.2
## 150 5.9
iris[, 1]## [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2
## [29] 5.2 4.7 4.8 5.4 5.2 5.5 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0 6.4 6.9 5.5 6.5 5.7
## [57] 6.3 4.9 6.6 5.2 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0
## [85] 5.4 6.0 6.7 6.3 5.6 5.5 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4
## [113] 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.0 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.0 6.9
## [141] 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9
as_tibble(iris)[1]
## # A tibble: 150 x 1
## Sepal.Length
## <dbl>
## 1 5.1
## 2 4.9
## 3 4.7
## 4 4.6
## 5 5.0
## 6 5.4
## 7 4.6
## 8 5.0
## 9 4.4
## 10 4.9
## # ... with 140 more rows
as_tibble(iris)[, 1]
## # A tibble: 150 x 1
## Sepal.Length
## <dbl>
## 1 5.1
## 2 4.9
## 3 4.7
## 4 4.6
## 5 5.0
## 6 5.4
## 7 4.6
## 8 5.0
## 9 4.4
## 10 4.9
## # ... with 140 more rows
as_tibble(iris)[[1]]
## [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1 5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2
## [29] 5.2 4.7 4.8 5.4 5.2 5.5 4.9 5.0 5.5 4.9 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0 6.4 6.9 5.5 6.5 5.7
## [57] 6.3 4.9 6.6 5.2 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0
## [85] 5.4 6.0 6.7 6.3 5.6 5.5 5.5 6.1 5.8 5.0 5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4
## [113] 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.0 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.0 6.9
## [141] 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9
dplyr is one of the packages in tidyverse which provides a consistent set of data manipulation verbs.
filter(iris, Sepal.Length > 4, Petal.Width == 0.1) # Note that the variable names do not have quotes, '', around them
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 4.9 3.1 1.5 0.1 setosa
## 2 4.8 3.0 1.4 0.1 setosa
## 3 4.3 3.0 1.1 0.1 setosa
## 4 5.2 4.1 1.5 0.1 setosa
## 5 4.9 3.6 1.4 0.1 setosa
install.packages("nycflights13"), and load its library, library(nycflights13).
library(nycflights13)
flights## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR
## 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA
## 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK
## 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA
## 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR
## 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR
## 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA
## 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK
## 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA
## # ... with 336,766 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
summary(flights)## year month day dep_time sched_dep_time dep_delay arr_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106 Min. : -43.00 Min. : 1
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906 1st Qu.: -5.00 1st Qu.:1104
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359 Median : -2.00 Median :1535
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344 Mean : 12.64 Mean :1502
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729 3rd Qu.: 11.00 3rd Qu.:1940
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359 Max. :1301.00 Max. :2400
## NA's :8255 NA's :8255 NA's :8713
## sched_arr_time arr_delay carrier flight tailnum origin
## Min. : 1 Min. : -86.000 Length:336776 Min. : 1 Length:336776 Length:336776
## 1st Qu.:1124 1st Qu.: -17.000 Class :character 1st Qu.: 553 Class :character Class :character
## Median :1556 Median : -5.000 Mode :character Median :1496 Mode :character Mode :character
## Mean :1536 Mean : 6.895 Mean :1972
## 3rd Qu.:1945 3rd Qu.: 14.000 3rd Qu.:3465
## Max. :2359 Max. :1272.000 Max. :8500
## NA's :9430
## dest air_time distance hour minute time_hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00 Min. : 0.00 Min. :2013-01-01 05:00:00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Mode :character Median :129.0 Median : 872 Median :13.00 Median :29.00 Median :2013-07-03 10:00:00
## Mean :150.7 Mean :1040 Mean :13.18 Mean :26.23 Mean :2013-07-03 05:02:36
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :695.0 Max. :4983 Max. :23.00 Max. :59.00 Max. :2013-12-31 23:00:00
## NA's :9430
int integers
dbl doubles or real numbers
chr character vectors (strings)
dttm date-time
date date
lgl logical (TRUE or FALSE)
fctr factors (categorical variables with fixed possible values, e.g., dropdown list)
list like a vector but can contain different types of elements
flights[c("dep_time", "tailnum", "air_time", "time_hour")]
## # A tibble: 336,776 x 4
## dep_time tailnum air_time time_hour
## <int> <chr> <dbl> <dttm>
## 1 517 N14228 227 2013-01-01 05:00:00
## 2 533 N24211 227 2013-01-01 05:00:00
## 3 542 N619AA 160 2013-01-01 05:00:00
## 4 544 N804JB 183 2013-01-01 05:00:00
## 5 554 N668DN 116 2013-01-01 06:00:00
## 6 554 N39463 150 2013-01-01 05:00:00
## 7 555 N516JB 158 2013-01-01 06:00:00
## 8 557 N829AS 53 2013-01-01 06:00:00
## 9 557 N593JB 140 2013-01-01 06:00:00
## 10 558 N3ALAA 138 2013-01-01 06:00:00
## # ... with 336,766 more rows
use filter() to find rows/cases where conditions are true
# Find all flights which went from JFK to Fort Lauderdale in the first week of January
filter(flights, origin == "JFK", dest == "FLL", month == 1, day <= 7)## # A tibble: 106 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 1 1 659 700 -1 1008 1007 1 B6 981 N646JB JFK
## 2 2013 1 1 712 715 -3 1023 1035 -12 AA 825 N3ETAA JFK
## 3 2013 1 1 805 800 5 1118 1106 12 B6 3 N570JB JFK
## 4 2013 1 1 933 904 29 1252 1210 42 B6 17 N579JB JFK
## 5 2013 1 1 1153 1123 30 1454 1425 29 B6 1 N552JB JFK
## 6 2013 1 1 1251 1252 -1 1611 1555 16 B6 85 N657JB JFK
## 7 2013 1 1 1452 1457 -5 1753 1811 -18 B6 61 N292JB JFK
## 8 2013 1 1 1527 1530 -3 1841 1855 -14 AA 1039 N3HYAA JFK
## 9 2013 1 1 1610 1615 -5 1913 1948 -35 DL 1411 N947DL JFK
## 10 2013 1 1 1713 1700 13 2006 2014 -8 B6 15 N346JB JFK
## # ... with 96 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
filter(flights, (origin == "JFK" | dest == "FLL"), month == 1, day <= 7) # , is the same as AND and | is the same as OR## # A tibble: 2,340 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK
## 2 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK
## 3 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR
## 4 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK
## 5 2013 1 1 558 600 -2 849 851 -2 B6 49 N793JB JFK
## 6 2013 1 1 558 600 -2 853 856 -3 B6 71 N657JB JFK
## 7 2013 1 1 558 600 -2 924 917 7 UA 194 N29129 JFK
## 8 2013 1 1 559 559 0 702 706 -4 B6 1806 N708JB JFK
## 9 2013 1 1 600 600 0 851 858 -7 B6 371 N595JB LGA
## 10 2013 1 1 606 610 -4 837 845 -8 DL 1743 N3739P JFK
## # ... with 2,330 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
# Find all flights going to Fort Lauderdale, Atlanta or O'Hare
filter(flights, dest == "FLL" | dest == "ATL" | dest == "ORD")## # A tibble: 46,553 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA
## 2 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR
## 3 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR
## 4 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA
## 5 2013 1 1 600 600 0 851 858 -7 B6 371 N595JB LGA
## 6 2013 1 1 600 600 0 837 825 12 MQ 4650 N542MQ LGA
## 7 2013 1 1 606 610 -4 837 845 -8 DL 1743 N3739P JFK
## 8 2013 1 1 608 600 8 807 735 32 MQ 3768 N9EAMQ EWR
## 9 2013 1 1 615 615 0 833 842 -9 DL 575 N326NB EWR
## 10 2013 1 1 629 630 -1 824 810 14 AA 303 N3CYAA LGA
## # ... with 46,543 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
filter(flights, dest %in% c("FLL", "ATL", "ORD")) # use %in% to search for multiple values in a single variable## # A tibble: 46,553 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA
## 2 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR
## 3 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR
## 4 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA
## 5 2013 1 1 600 600 0 851 858 -7 B6 371 N595JB LGA
## 6 2013 1 1 600 600 0 837 825 12 MQ 4650 N542MQ LGA
## 7 2013 1 1 606 610 -4 837 845 -8 DL 1743 N3739P JFK
## 8 2013 1 1 608 600 8 807 735 32 MQ 3768 N9EAMQ EWR
## 9 2013 1 1 615 615 0 833 842 -9 DL 575 N326NB EWR
## 10 2013 1 1 629 630 -1 824 810 14 AA 303 N3CYAA LGA
## # ... with 46,543 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Find all flights that have values for their departure delays
filter(flights, is.na(dep_delay)) # show all the rows with NA (missing values)## # A tibble: 8,255 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 1 1 NA 1630 NA NA 1815 NA EV 4308 N18120 EWR
## 2 2013 1 1 NA 1935 NA NA 2240 NA AA 791 N3EHAA LGA
## 3 2013 1 1 NA 1500 NA NA 1825 NA AA 1925 N3EVAA LGA
## 4 2013 1 1 NA 600 NA NA 901 NA B6 125 N618JB JFK
## 5 2013 1 2 NA 1540 NA NA 1747 NA EV 4352 N10575 EWR
## 6 2013 1 2 NA 1620 NA NA 1746 NA EV 4406 N13949 EWR
## 7 2013 1 2 NA 1355 NA NA 1459 NA EV 4434 N10575 EWR
## 8 2013 1 2 NA 1420 NA NA 1644 NA EV 4935 N759EV EWR
## 9 2013 1 2 NA 1321 NA NA 1536 NA EV 3849 N13550 EWR
## 10 2013 1 2 NA 1545 NA NA 1910 NA AA 133 <NA> JFK
## # ... with 8,245 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
filter(flights, !is.na(dep_delay)) # show only the rows without missing values## # A tibble: 328,521 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR
## 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA
## 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK
## 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA
## 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR
## 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR
## 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA
## 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK
## 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA
## # ... with 328,511 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
use arrange() to sort the data based on one or more variables
# Sort the flights based on their scheduled departure time
arrange(flights, sched_dep_time)## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 7 27 NA 106 NA NA 245 NA US 1632 <NA> EWR
## 2 2013 1 2 458 500 -2 703 650 13 US 1030 N162UW EWR
## 3 2013 1 3 458 500 -2 650 650 0 US 1030 N172US EWR
## 4 2013 1 4 456 500 -4 631 650 -19 US 1030 N186US EWR
## 5 2013 1 5 458 500 -2 640 650 -10 US 1030 N560UW EWR
## 6 2013 1 6 458 500 -2 718 650 28 US 1030 N539UW EWR
## 7 2013 1 7 454 500 -6 637 648 -11 US 1117 N566UW EWR
## 8 2013 1 8 454 500 -6 625 648 -23 US 1117 N564UW EWR
## 9 2013 1 9 457 500 -3 647 648 -1 US 1117 N566UW EWR
## 10 2013 1 10 450 500 -10 634 648 -14 US 1117 N171US EWR
## # ... with 336,766 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Sort the flights based on their scheduled departure time, and break ties using their actual departure time
arrange(flights, sched_dep_time, dep_time)## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 7 27 NA 106 NA NA 245 NA US 1632 <NA> EWR
## 2 2013 5 8 445 500 -15 620 640 -20 US 1431 N537UW EWR
## 3 2013 5 5 446 500 -14 636 640 -4 US 1579 N768US EWR
## 4 2013 9 4 446 500 -14 618 648 -30 US 1877 N567UW EWR
## 5 2013 10 1 447 500 -13 614 648 -34 US 1877 N538UW EWR
## 6 2013 9 19 447 500 -13 620 648 -28 US 1877 N199UW EWR
## 7 2013 1 29 448 500 -12 635 648 -13 US 1117 N172US EWR
## 8 2013 12 27 448 500 -12 648 651 -3 US 1895 N543UW EWR
## 9 2013 5 7 448 500 -12 624 640 -16 US 1099 N565UW EWR
## 10 2013 10 2 449 500 -11 620 648 -28 US 1877 N193UW EWR
## # ... with 336,766 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Sort the flights by those scheduled to depart latest, and break ties in that group by those who left earliest
arrange(flights, desc(sched_dep_time), dep_time)## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 11 13 1 2359 2 442 440 2 B6 1503 N627JB JFK
## 2 2013 12 16 1 2359 2 447 437 10 B6 839 N607JB JFK
## 3 2013 12 20 1 2359 2 430 440 -10 B6 1503 N608JB JFK
## 4 2013 12 26 1 2359 2 437 440 -3 B6 1503 N527JB JFK
## 5 2013 12 30 1 2359 2 441 437 4 B6 839 N508JB JFK
## 6 2013 4 5 1 2359 2 410 339 31 B6 727 N606JB JFK
## 7 2013 5 25 1 2359 2 336 341 -5 B6 727 N523JB JFK
## 8 2013 6 20 1 2359 2 340 350 -10 B6 745 N517JB JFK
## 9 2013 7 27 1 2359 2 345 340 5 B6 839 N503JB JFK
## 10 2013 7 28 1 2359 2 423 350 33 B6 745 N703JB JFK
## # ... with 336,766 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
arrange(flights, desc(sched_dep_time), dep_delay)## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 10 2 2341 2359 -18 324 350 -26 B6 745 N527JB JFK
## 2 2013 9 23 2342 2359 -17 331 350 -19 B6 745 N503JB JFK
## 3 2013 10 22 2343 2359 -16 347 350 -3 B6 745 N592JB JFK
## 4 2013 3 4 2343 2359 -16 418 438 -20 B6 727 N603JB JFK
## 5 2013 1 20 2344 2359 -15 428 437 -9 B6 727 N603JB JFK
## 6 2013 4 16 2344 2359 -15 313 343 -30 B6 707 N554JB JFK
## 7 2013 1 27 2345 2359 -14 424 444 -20 B6 739 N630JB JFK
## 8 2013 3 3 2345 2359 -14 441 438 3 B6 727 N508JB JFK
## 9 2013 3 5 2345 2359 -14 439 438 1 B6 727 N705JB JFK
## 10 2013 10 6 2346 2359 -13 333 350 -17 B6 745 N636JB JFK
## # ... with 336,766 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
use select() and rename() to pick variables based on their names
# Select the year, month, day, dep_times, and sched_dep_time columns
select(flights, year, month, day, dep_time, sched_dep_time)## # A tibble: 336,776 x 5
## year month day dep_time sched_dep_time
## <int> <int> <int> <int> <int>
## 1 2013 1 1 517 515
## 2 2013 1 1 533 529
## 3 2013 1 1 542 540
## 4 2013 1 1 544 545
## 5 2013 1 1 554 600
## 6 2013 1 1 554 558
## 7 2013 1 1 555 600
## 8 2013 1 1 557 600
## 9 2013 1 1 557 600
## 10 2013 1 1 558 600
## # ... with 336,766 more rows
select(flights, year:sched_dep_time)## # A tibble: 336,776 x 5
## year month day dep_time sched_dep_time
## <int> <int> <int> <int> <int>
## 1 2013 1 1 517 515
## 2 2013 1 1 533 529
## 3 2013 1 1 542 540
## 4 2013 1 1 544 545
## 5 2013 1 1 554 600
## 6 2013 1 1 554 558
## 7 2013 1 1 555 600
## 8 2013 1 1 557 600
## 9 2013 1 1 557 600
## 10 2013 1 1 558 600
## # ... with 336,766 more rows
select(flights, 1:5)## # A tibble: 336,776 x 5
## year month day dep_time sched_dep_time
## <int> <int> <int> <int> <int>
## 1 2013 1 1 517 515
## 2 2013 1 1 533 529
## 3 2013 1 1 542 540
## 4 2013 1 1 544 545
## 5 2013 1 1 554 600
## 6 2013 1 1 554 558
## 7 2013 1 1 555 600
## 8 2013 1 1 557 600
## 9 2013 1 1 557 600
## 10 2013 1 1 558 600
## # ... with 336,766 more rows
select(flights, -(dep_delay:time_hour)) # more useful when removing only a few columns## # A tibble: 336,776 x 5
## year month day dep_time sched_dep_time
## <int> <int> <int> <int> <int>
## 1 2013 1 1 517 515
## 2 2013 1 1 533 529
## 3 2013 1 1 542 540
## 4 2013 1 1 544 545
## 5 2013 1 1 554 600
## 6 2013 1 1 554 558
## 7 2013 1 1 555 600
## 8 2013 1 1 557 600
## 9 2013 1 1 557 600
## 10 2013 1 1 558 600
## # ... with 336,766 more rows
rename lets you change the name of a variable while still keeping the full data set
rename(flights, sun_cycles = year)## # A tibble: 336,776 x 19
## sun_cycles month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228
## 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211
## 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB
## 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN
## 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463
## 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB
## 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS
## 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB
## 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA
## # ... with 336,766 more rows, and 7 more variables: origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights # Note that we are not assigning any of these outputs, so if you call the original dataset, it hasn't changed## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR
## 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA
## 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK
## 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA
## 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR
## 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR
## 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA
## 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK
## 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA
## # ... with 336,766 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
The everything() helper lets you use select to rearrange the order of the variables
select(flights, distance, air_time, everything())## # A tibble: 336,776 x 19
## distance air_time year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
## <dbl> <dbl> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 1400 227 2013 1 1 517 515 2 830 819 11 UA
## 2 1416 227 2013 1 1 533 529 4 850 830 20 UA
## 3 1089 160 2013 1 1 542 540 2 923 850 33 AA
## 4 1576 183 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 762 116 2013 1 1 554 600 -6 812 837 -25 DL
## 6 719 150 2013 1 1 554 558 -4 740 728 12 UA
## 7 1065 158 2013 1 1 555 600 -5 913 854 19 B6
## 8 229 53 2013 1 1 557 600 -3 709 723 -14 EV
## 9 944 140 2013 1 1 557 600 -3 838 846 -8 B6
## 10 733 138 2013 1 1 558 600 -2 753 745 8 AA
## # ... with 336,766 more rows, and 7 more variables: flight <int>, tailnum <chr>, origin <chr>, dest <chr>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
mutate adds new variables, while transmute drops existing variables
# create a subset of the full dataset so that you can see new variables being added
# Keep a small subset of columns so newly added variables are easy to see
flights_sml <- flights %>% select(dep_time, arr_time, air_time, distance)
flights_sml## # A tibble: 336,776 x 4
## dep_time arr_time air_time distance
## <int> <int> <dbl> <dbl>
## 1 517 830 227 1400
## 2 533 850 227 1416
## 3 542 923 160 1089
## 4 544 1004 183 1576
## 5 554 812 116 762
## 6 554 740 150 719
## 7 555 913 158 1065
## 8 557 709 53 229
## 9 557 838 140 944
## 10 558 753 138 733
## # ... with 336,766 more rows
mutate(flights_sml, avg_speed = distance/air_time, dep_hr = dep_time%/%100, dep_min = dep_time%%100)## # A tibble: 336,776 x 7
## dep_time arr_time air_time distance avg_speed dep_hr dep_min
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 517 830 227 1400 6.167401 5 17
## 2 533 850 227 1416 6.237885 5 33
## 3 542 923 160 1089 6.806250 5 42
## 4 544 1004 183 1576 8.612022 5 44
## 5 554 812 116 762 6.568966 5 54
## 6 554 740 150 719 4.793333 5 54
## 7 555 913 158 1065 6.740506 5 55
## 8 557 709 53 229 4.320755 5 57
## 9 557 838 140 944 6.742857 5 57
## 10 558 753 138 733 5.311594 5 58
## # ... with 336,766 more rows
transmute(flights_sml, avg_speed = distance/air_time, dep_hr = dep_time%/%100, dep_min = dep_time%%100)## # A tibble: 336,776 x 3
## avg_speed dep_hr dep_min
## <dbl> <dbl> <dbl>
## 1 6.167401 5 17
## 2 6.237885 5 33
## 3 6.806250 5 42
## 4 8.612022 5 44
## 5 6.568966 5 54
## 6 4.793333 5 54
## 7 6.740506 5 55
## 8 4.320755 5 57
## 9 6.742857 5 57
## 10 5.311594 5 58
## # ... with 336,766 more rows
?mutate for the full suggested list
lead() and lag() find the next and previous values in a vector, respectively.
cumsum(), cummean(), and others (see help doc) take running sums, means and other properties.
# How much time is there between each flight and the next?
mutate(flights_sml, dep_time_offset = lag(dep_time), dep_time_lag = dep_time - lag(dep_time))## # A tibble: 336,776 x 6
## dep_time arr_time air_time distance dep_time_offset dep_time_lag
## <int> <int> <dbl> <dbl> <int> <int>
## 1 517 830 227 1400 NA NA
## 2 533 850 227 1416 517 16
## 3 542 923 160 1089 533 9
## 4 544 1004 183 1576 542 2
## 5 554 812 116 762 544 10
## 6 554 740 150 719 554 0
## 7 555 913 158 1065 554 1
## 8 557 709 53 229 555 2
## 9 557 838 140 944 557 0
## 10 558 753 138 733 557 1
## # ... with 336,766 more rows
mutate(flights_sml, dep_time_offset = lead(dep_time), dep_time_lead = lead(dep_time) - dep_time)## # A tibble: 336,776 x 6
## dep_time arr_time air_time distance dep_time_offset dep_time_lead
## <int> <int> <dbl> <dbl> <int> <int>
## 1 517 830 227 1400 533 16
## 2 533 850 227 1416 542 9
## 3 542 923 160 1089 544 2
## 4 544 1004 183 1576 554 10
## 5 554 812 116 762 554 0
## 6 554 740 150 719 555 1
## 7 555 913 158 1065 557 2
## 8 557 709 53 229 557 0
## 9 557 838 140 944 558 1
## 10 558 753 138 733 558 0
## # ... with 336,766 more rows
mutate(flights_sml, total_dist = cumsum(distance))## # A tibble: 336,776 x 5
## dep_time arr_time air_time distance total_dist
## <int> <int> <dbl> <dbl> <dbl>
## 1 517 830 227 1400 1400
## 2 533 850 227 1416 2816
## 3 542 923 160 1089 3905
## 4 544 1004 183 1576 5481
## 5 554 812 116 762 6243
## 6 554 740 150 719 6962
## 7 555 913 158 1065 8027
## 8 557 709 53 229 8256
## 9 557 838 140 944 9200
## 10 558 753 138 733 9933
## # ... with 336,766 more rows
Summarise reduces multiple values to a single summary metric
summarise(flights, delay = mean(dep_delay))## # A tibble: 1 x 1
## delay
## <dbl>
## 1 NA
Why did we get NA? NA means "not available", i.e., a missing value.
NaN ("not a number") refers to impossible values, e.g., dividing by zero. If a calculation includes NA values, the output will always be NA.
R treats NA's as being any possible value; as a result, any summary metric will result in an unknown quantity, as the unknown NA value could have significantly impacted the result. To ignore missing values, drop them with na.rm = TRUE.
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 x 1
## delay
## <dbl>
## 1 12.63907
On its own, summarise isn’t that useful as we rarely want to reduce all our data down to a single metric.
summarise is much more useful when combined with the other expressions.
# Restrict the data to January flights before summarising
jan_delay <- flights %>% filter(month == 1)
summarise(jan_delay, delay = mean(dep_delay, na.rm = TRUE))## # A tibble: 1 x 1
## delay
## <dbl>
## 1 10.03667
The real power of summarise is seen when coupled with group_by. group_by takes an existing tibble and converts it into a grouped tibble where operations are performed “by group”.
group_by does not change how the data looks, instead it changes how it interacts with the other verbs, most notably summarise.
flights_month <- group_by(flights, month)
flights_month## # A tibble: 336,776 x 19
## # Groups: month [12]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR
## 2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA
## 3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK
## 5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA
## 6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR
## 7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR
## 8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA
## 9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK
## 10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA
## # ... with 336,766 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
summarise(flights_month, delay = mean(dep_delay, na.rm = TRUE))## # A tibble: 12 x 2
## month delay
## <int> <dbl>
## 1 1 10.036665
## 2 2 10.816843
## 3 3 13.227076
## 4 4 13.938038
## 5 5 12.986859
## 6 6 20.846332
## 7 7 21.727787
## 8 8 12.611040
## 9 9 6.722476
## 10 10 6.243988
## 11 11 5.435362
## 12 12 16.576688
Also it’s good practice when grouping to add a counts column using n()
summarise(flights_month, delay = mean(dep_delay, na.rm = TRUE), count = n())## # A tibble: 12 x 3
## month delay count
## <int> <dbl> <int>
## 1 1 10.036665 27004
## 2 2 10.816843 24951
## 3 3 13.227076 28834
## 4 4 13.938038 28330
## 5 5 12.986859 28796
## 6 6 20.846332 28243
## 7 7 21.727787 29425
## 8 8 12.611040 29327
## 9 9 6.722476 27574
## 10 10 6.243988 28889
## 11 11 5.435362 27268
## 12 12 16.576688 28135
Often you will need to string multiple actions together which can get somewhat messy
# On average which hour of the day has the most delayed american airline flights
# Add a scheduled-departure hour column, keep complete American Airlines rows,
# then narrow down to the departure columns of interest
flights_mut <- flights %>% mutate(hr = sched_dep_time %/% 100)
flights_filt <- flights_mut %>% filter(carrier == "AA", complete.cases(flights_mut))
flights_sel <- flights_filt %>% select(dep_time, hr, sched_dep_time, dep_delay)
flights_sel # print out to confirm that you are selecting what you intend
## # A tibble: 31,947 x 4
## dep_time hr sched_dep_time dep_delay
## <int> <dbl> <int> <dbl>
## 1 542 5 540 2
## 2 558 6 600 -2
## 3 559 6 600 -1
## 4 606 6 610 -4
## 5 623 6 610 13
## 6 628 6 630 -2
## 7 629 6 630 -1
## 8 635 6 635 0
## 9 656 7 700 -4
## 10 656 6 659 -3
## # ... with 31,937 more rows
# Group by departure hour, compute the mean delay and group size,
# and sort so the worst hours come first
flights_gb <- flights_sel %>% group_by(hr)
flights_sum <- flights_gb %>% summarise(mean_delay = mean(dep_delay), count = n())
flights_arr <- flights_sum %>% arrange(desc(mean_delay))
print(flights_arr, n = 24)
## # A tibble: 17 x 3
## hr mean_delay count
## <dbl> <dbl> <int>
## 1 19 21.2870418 1937
## 2 17 19.9348861 3993
## 3 21 18.2857143 406
## 4 18 14.3431713 1728
## 5 15 12.3666921 2618
## 6 20 12.2851240 484
## 7 16 11.9728287 2061
## 8 14 7.7936709 1580
## 9 13 7.3642305 1683
## 10 10 5.8927536 1380
## 11 11 5.1475167 1349
## 12 12 4.9592920 2260
## 13 9 2.6576763 2410
## 14 8 2.1545312 1909
## 15 5 0.6814404 361
## 16 7 -0.4511713 3287
## 17 6 -1.0215914 2501
The pipes %>% or CTRL+Shift+M (from the magrittr package which is included in tidyverse) allows you to do the same set of actions in a much simpler manner
# Same analysis as above in one self-contained pipeline.
# Inside a %>% chain, `.` stands for the data at that point in the pipe,
# so we use complete.cases(.) instead of referencing the flights_mut
# intermediate from the step-by-step version — the pipeline no longer
# depends on that variable existing in the workspace.
flights %>%
  mutate(hr = sched_dep_time %/% 100) %>%                # scheduled departure hour
  filter(carrier == "AA", complete.cases(.)) %>%         # AA flights with no missing values
  select(dep_time, hr, sched_dep_time, dep_delay) %>%
  group_by(hr) %>%
  summarise(mean_delay = mean(dep_delay), count = n()) %>%
  arrange(desc(mean_delay))
## # A tibble: 17 x 3
## hr mean_delay count
## <dbl> <dbl> <int>
## 1 19 21.2870418 1937
## 2 17 19.9348861 3993
## 3 21 18.2857143 406
## 4 18 14.3431713 1728
## 5 15 12.3666921 2618
## 6 20 12.2851240 484
## 7 16 11.9728287 2061
## 8 14 7.7936709 1580
## 9 13 7.3642305 1683
## 10 10 5.8927536 1380
## 11 11 5.1475167 1349
## 12 12 4.9592920 2260
## 13 9 2.6576763 2410
## 14 8 2.1545312 1909
## 15 5 0.6814404 361
## 16 7 -0.4511713 3287
## 17 6 -1.0215914 2501
The fivethirtyeight package has lots of fun datasets. To see the motivation behind the package’s creation as well as a description of each of the different datasets use vignette("fivethirtyeight", package = "fivethirtyeight")
# Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name
# 1893,2017-03-01 00:00:32,2017-03-01 00:32:06,2009,Catherine St & Monroe St
# 223,2017-03-01 00:01:09,2017-03-01 00:04:53,127,Barrow St & Hudson St
# 1665,2017-03-01 00:01:27,2017-03-01 00:29:12,174,E 25 St & 1 Ave
# 100,2017-03-01 00:01:29,2017-03-01 00:03:10,316,Fulton St & William St
# 1229,2017-03-01 00:01:33,2017-03-01 00:22:02,536,1 Ave & E 30 St
# 613,2017-03-01 00:01:57,2017-03-01 00:12:11,259,South St & Whitehall St
# 157,2017-03-01 00:02:12,2017-03-01 00:04:49,3329,Degraw St & Smith St
Base R provides read.csv() which reads the data into a dataframe, but instead we will be using the tidyverse package readr, which uses the slightly different read_csv().
Because the read_csv function is powerful and versatile, it’s good to look at its default options before using it.
# read_csv(file, col_names = TRUE, col_types = NULL,
# locale = default_locale(), na = c("", "NA"), quoted_na = TRUE,
# quote = "\"", comment = "", trim_ws = TRUE, skip = 0, n_max = Inf,
# guess_max = min(1000, n_max), progress = show_progress())
read_csv("./citibike_data/201701-citibike-tripdata.csv")
## Parsed with column specification:
## cols(
## `Trip Duration` = col_integer(),
## `Start Time` = col_datetime(format = ""),
## `Stop Time` = col_datetime(format = ""),
## `Start Station ID` = col_integer(),
## `Start Station Name` = col_character(),
## `Start Station Latitude` = col_double(),
## `Start Station Longitude` = col_double(),
## `End Station ID` = col_integer(),
## `End Station Name` = col_character(),
## `End Station Latitude` = col_double(),
## `End Station Longitude` = col_double(),
## `Bike ID` = col_integer(),
## `User Type` = col_character(),
## `Birth Year` = col_integer(),
## Gender = col_integer()
## )
## # A tibble: 726,676 x 15
## `Trip Duration` `Start Time` `Stop Time` `Start Station ID` `Start Station Name`
## <int> <dttm> <dttm> <int> <chr>
## 1 680 2017-01-01 00:00:21 2017-01-01 00:11:41 3226 W 82 St & Central Park West
## 2 1282 2017-01-01 00:00:45 2017-01-01 00:22:08 3263 Cooper Square & E 7 St
## 3 648 2017-01-01 00:00:57 2017-01-01 00:11:46 3143 5 Ave & E 78 St
## 4 631 2017-01-01 00:01:10 2017-01-01 00:11:42 3143 5 Ave & E 78 St
## 5 621 2017-01-01 00:01:25 2017-01-01 00:11:47 3143 5 Ave & E 78 St
## 6 666 2017-01-01 00:01:51 2017-01-01 00:12:57 3163 Central Park West & W 68 St
## 7 559 2017-01-01 00:05:00 2017-01-01 00:14:20 499 Broadway & W 60 St
## 8 826 2017-01-01 00:05:37 2017-01-01 00:19:24 362 Broadway & W 37 St
## 9 255 2017-01-01 00:05:47 2017-01-01 00:10:02 430 York St & Jay St
## 10 634 2017-01-01 00:07:34 2017-01-01 00:18:08 3165 Central Park West & W 72 St
## # ... with 726,666 more rows, and 10 more variables: `Start Station Latitude` <dbl>, `Start Station Longitude` <dbl>,
## # `End Station ID` <int>, `End Station Name` <chr>, `End Station Latitude` <dbl>, `End Station Longitude` <dbl>,
## # `Bike ID` <int>, `User Type` <chr>, `Birth Year` <int>, Gender <int>
You can see that R solved the problem of spaces in the column names by putting ` around them (note this is a backtick, not a single quote), but this makes it annoying to refer to these variables. Instead, we can pass a snake_case version of the column names to read_csv ourselves.
# Snake-case names for the 15 Citi Bike trip columns, in file order,
# to be supplied to read_csv via col_names.
column_names <- c(
  "trip_duration",
  "start_time", "stop_time",
  "start_id", "start_name", "start_lat", "start_long",
  "end_id", "end_name", "end_lat", "end_long",
  "bike_id", "user_type", "birth_year", "gender"
)
read_csv("./citibike_data/201701-citibike-tripdata.csv", col_names = column_names)## Parsed with column specification:
## cols(
## trip_duration = col_character(),
## start_time = col_character(),
## stop_time = col_character(),
## start_id = col_character(),
## start_name = col_character(),
## start_lat = col_character(),
## start_long = col_character(),
## end_id = col_character(),
## end_name = col_character(),
## end_lat = col_character(),
## end_long = col_character(),
## bike_id = col_character(),
## user_type = col_character(),
## birth_year = col_character(),
## gender = col_character()
## )
## # A tibble: 726,677 x 15
## trip_duration start_time stop_time start_id start_name
## <chr> <chr> <chr> <chr> <chr>
## 1 Trip Duration Start Time Stop Time Start Station ID Start Station Name
## 2 680 2017-01-01 00:00:21 2017-01-01 00:11:41 3226 W 82 St & Central Park West
## 3 1282 2017-01-01 00:00:45 2017-01-01 00:22:08 3263 Cooper Square & E 7 St
## 4 648 2017-01-01 00:00:57 2017-01-01 00:11:46 3143 5 Ave & E 78 St
## 5 631 2017-01-01 00:01:10 2017-01-01 00:11:42 3143 5 Ave & E 78 St
## 6 621 2017-01-01 00:01:25 2017-01-01 00:11:47 3143 5 Ave & E 78 St
## 7 666 2017-01-01 00:01:51 2017-01-01 00:12:57 3163 Central Park West & W 68 St
## 8 559 2017-01-01 00:05:00 2017-01-01 00:14:20 499 Broadway & W 60 St
## 9 826 2017-01-01 00:05:37 2017-01-01 00:19:24 362 Broadway & W 37 St
## 10 255 2017-01-01 00:05:47 2017-01-01 00:10:02 430 York St & Jay St
## # ... with 726,667 more rows, and 10 more variables: start_lat <chr>, start_long <chr>, end_id <chr>, end_name <chr>,
## # end_lat <chr>, end_long <chr>, bike_id <chr>, user_type <chr>, birth_year <chr>, gender <chr>
The problem is that when you provide the column names, read_csv treats the first row as a regular row of entries, so if we want to use our own column names we need to skip that row
citi_bike <- read_csv("./citibike_data/201701-citibike-tripdata.csv", col_names = column_names, skip = 1)## Parsed with column specification:
## cols(
## trip_duration = col_integer(),
## start_time = col_datetime(format = ""),
## stop_time = col_datetime(format = ""),
## start_id = col_integer(),
## start_name = col_character(),
## start_lat = col_double(),
## start_long = col_double(),
## end_id = col_integer(),
## end_name = col_character(),
## end_lat = col_double(),
## end_long = col_double(),
## bike_id = col_integer(),
## user_type = col_character(),
## birth_year = col_integer(),
## gender = col_integer()
## )
citi_bike## # A tibble: 726,676 x 15
## trip_duration start_time stop_time start_id start_name start_lat start_long
## <int> <dttm> <dttm> <int> <chr> <dbl> <dbl>
## 1 680 2017-01-01 00:00:21 2017-01-01 00:11:41 3226 W 82 St & Central Park West 40.78275 -73.97137
## 2 1282 2017-01-01 00:00:45 2017-01-01 00:22:08 3263 Cooper Square & E 7 St 40.72924 -73.99087
## 3 648 2017-01-01 00:00:57 2017-01-01 00:11:46 3143 5 Ave & E 78 St 40.77683 -73.96389
## 4 631 2017-01-01 00:01:10 2017-01-01 00:11:42 3143 5 Ave & E 78 St 40.77683 -73.96389
## 5 621 2017-01-01 00:01:25 2017-01-01 00:11:47 3143 5 Ave & E 78 St 40.77683 -73.96389
## 6 666 2017-01-01 00:01:51 2017-01-01 00:12:57 3163 Central Park West & W 68 St 40.77341 -73.97783
## 7 559 2017-01-01 00:05:00 2017-01-01 00:14:20 499 Broadway & W 60 St 40.76916 -73.98192
## 8 826 2017-01-01 00:05:37 2017-01-01 00:19:24 362 Broadway & W 37 St 40.75173 -73.98754
## 9 255 2017-01-01 00:05:47 2017-01-01 00:10:02 430 York St & Jay St 40.70149 -73.98657
## 10 634 2017-01-01 00:07:34 2017-01-01 00:18:08 3165 Central Park West & W 72 St 40.77579 -73.97621
## # ... with 726,666 more rows, and 8 more variables: end_id <int>, end_name <chr>, end_lat <dbl>, end_long <dbl>,
## # bike_id <int>, user_type <chr>, birth_year <int>, gender <int>
Parsing, as performed by read_csv and similar functions, is the method of analyzing elements of a vector to determine the type of information within that vector. read_csv uses the first 1000 rows or the entire dataset, whichever is smaller, to parse each column.# Let's look again at our dataset and see if all our datatypes make sense
citi_bike## # A tibble: 726,676 x 15
## trip_duration start_time stop_time start_id start_name start_lat start_long end_id end_name end_lat end_long bike_id user_type birth_year gender
## <int> <dttm> <dttm> <int> <chr> <dbl> <dbl> <int> <chr> <dbl> <dbl> <int> <chr> <int> <int>
## 1 680 2017-01-01 00:00:21 2017-01-01 00:11:41 3226 W 82 St & Central Park West 40.78275 -73.97137 3165 Central Park West & W 72 St 40.77579 -73.97621 25542 Subscriber 1965 2
## 2 1282 2017-01-01 00:00:45 2017-01-01 00:22:08 3263 Cooper Square & E 7 St 40.72924 -73.99087 498 Broadway & W 32 St 40.74855 -73.98808 21136 Subscriber 1987 2
## 3 648 2017-01-01 00:00:57 2017-01-01 00:11:46 3143 5 Ave & E 78 St 40.77683 -73.96389 3152 3 Ave & E 71 St 40.76874 -73.96120 18147 Customer NA 0
## 4 631 2017-01-01 00:01:10 2017-01-01 00:11:42 3143 5 Ave & E 78 St 40.77683 -73.96389 3152 3 Ave & E 71 St 40.76874 -73.96120 21211 Customer NA 0
## 5 621 2017-01-01 00:01:25 2017-01-01 00:11:47 3143 5 Ave & E 78 St 40.77683 -73.96389 3152 3 Ave & E 71 St 40.76874 -73.96120 26819 Customer NA 0
## 6 666 2017-01-01 00:01:51 2017-01-01 00:12:57 3163 Central Park West & W 68 St 40.77341 -73.97783 3163 Central Park West & W 68 St 40.77341 -73.97783 16050 Subscriber 2000 1
## 7 559 2017-01-01 00:05:00 2017-01-01 00:14:20 499 Broadway & W 60 St 40.76916 -73.98192 479 9 Ave & W 45 St 40.76019 -73.99126 27294 Subscriber 1973 1
## 8 826 2017-01-01 00:05:37 2017-01-01 00:19:24 362 Broadway & W 37 St 40.75173 -73.98754 445 E 10 St & Avenue A 40.72741 -73.98142 23288 Subscriber 1977 2
## 9 255 2017-01-01 00:05:47 2017-01-01 00:10:02 430 York St & Jay St 40.70149 -73.98657 242 Carlton Ave & Flushing Ave 40.69779 -73.97374 25041 Subscriber 1989 1
## 10 634 2017-01-01 00:07:34 2017-01-01 00:18:08 3165 Central Park West & W 72 St 40.77579 -73.97621 3164 Columbus Ave & W 72 St 40.77706 -73.97898 16311 Subscriber 1980 1
## # ... with 726,666 more rows
summary(citi_bike)## trip_duration start_time stop_time start_id start_name
## Min. : 61 Min. :2017-01-01 00:00:21 Min. :2017-01-01 00:10:02 Min. : 72 Length:726676
## 1st Qu.: 331 1st Qu.:2017-01-11 08:40:46 1st Qu.:2017-01-11 08:52:47 1st Qu.: 358 Class :character
## Median : 526 Median :2017-01-18 09:25:48 Median :2017-01-18 09:37:23 Median : 482 Mode :character
## Mean : 778 Mean :2017-01-17 16:36:15 Mean :2017-01-17 16:49:13 Mean :1223
## 3rd Qu.: 860 3rd Qu.:2017-01-25 15:39:17 3rd Qu.:2017-01-25 15:52:34 3rd Qu.:3092
## Max. :5325688 Max. :2017-01-31 23:59:23 Max. :2017-03-14 14:13:45 Max. :3446
##
## start_lat start_long end_id end_name end_lat end_long bike_id
## Min. : 0.00 Min. :-74.03 Min. : 72 Length:726676 Min. : 0.00 Min. :-74.03 Min. :14529
## 1st Qu.:40.72 1st Qu.:-74.00 1st Qu.: 356 Class :character 1st Qu.:40.72 1st Qu.:-74.00 1st Qu.:17859
## Median :40.74 Median :-73.99 Median : 479 Mode :character Median :40.74 Median :-73.99 Median :21295
## Mean :40.74 Mean :-73.98 Mean :1197 Mean :40.74 Mean :-73.99 Mean :21713
## 3rd Qu.:40.76 3rd Qu.:-73.98 3rd Qu.:3078 3rd Qu.:40.76 3rd Qu.:-73.98 3rd Qu.:25803
## Max. :40.80 Max. : 0.00 Max. :3447 Max. :40.80 Max. : 0.00 Max. :27325
##
## user_type birth_year gender
## Length:726676 Min. :1885 Min. :0.000
## Class :character 1st Qu.:1969 1st Qu.:1.000
## Mode :character Median :1979 Median :1.000
## Mean :1977 Mean :1.167
## 3rd Qu.:1987 3rd Qu.:1.000
## Max. :2000 Max. :2.000
## NA's :29076
The levels() function lists all the possible options of a factor variable. Columns can be read in as factors with the col_factor() function, or converted afterwards with the as.factor() function.# In the citi_bike case start_id, end_id, user_type and gender are all variables which should be factors
# Convert the categorical columns to factors in one pass with across(),
# instead of repeating as.factor() once per column. Behavior is identical:
# start_id, end_id, user_type and gender each become a factor.
citi_bike_fac <- citi_bike %>%
  mutate(across(c(start_id, end_id, user_type, gender), as.factor))
citi_bike_fac## # A tibble: 726,676 x 15
## trip_duration start_time stop_time start_id start_name start_lat start_long
## <int> <dttm> <dttm> <fctr> <chr> <dbl> <dbl>
## 1 680 2017-01-01 00:00:21 2017-01-01 00:11:41 3226 W 82 St & Central Park West 40.78275 -73.97137
## 2 1282 2017-01-01 00:00:45 2017-01-01 00:22:08 3263 Cooper Square & E 7 St 40.72924 -73.99087
## 3 648 2017-01-01 00:00:57 2017-01-01 00:11:46 3143 5 Ave & E 78 St 40.77683 -73.96389
## 4 631 2017-01-01 00:01:10 2017-01-01 00:11:42 3143 5 Ave & E 78 St 40.77683 -73.96389
## 5 621 2017-01-01 00:01:25 2017-01-01 00:11:47 3143 5 Ave & E 78 St 40.77683 -73.96389
## 6 666 2017-01-01 00:01:51 2017-01-01 00:12:57 3163 Central Park West & W 68 St 40.77341 -73.97783
## 7 559 2017-01-01 00:05:00 2017-01-01 00:14:20 499 Broadway & W 60 St 40.76916 -73.98192
## 8 826 2017-01-01 00:05:37 2017-01-01 00:19:24 362 Broadway & W 37 St 40.75173 -73.98754
## 9 255 2017-01-01 00:05:47 2017-01-01 00:10:02 430 York St & Jay St 40.70149 -73.98657
## 10 634 2017-01-01 00:07:34 2017-01-01 00:18:08 3165 Central Park West & W 72 St 40.77579 -73.97621
## # ... with 726,666 more rows, and 8 more variables: end_id <fctr>, end_name <chr>, end_lat <dbl>, end_long <dbl>,
## # bike_id <int>, user_type <fctr>, birth_year <int>, gender <fctr>
levels(citi_bike_fac$gender)## [1] "0" "1" "2"